# Module 4 Exercise — Date & Text Handling
# Practice with lubridate, stringr, and forcats (R4DS Chapters 14, 15, 16)

# ===========================
# SETUP: Load Required Packages
# ===========================

library(dplyr)
library(tibble)
library(lubridate)
library(stringr)    # R4DS Chapter 14: Strings
library(forcats)    # R4DS Chapter 16: Factors

# ===========================
# EXERCISE 1: Date Parsing Challenge
# ===========================

# Raw adverse-event records (realistic clinical scenario): every start date in
# AESTDTC_MESSY is written in a different layout, while RFSTDTC holds the same
# reference start date for all six subjects.
ae_raw <- tibble(
  USUBJID = c(
    "001-001", "001-002", "001-003",
    "001-004", "001-005", "001-006"
  ),
  AEDECOD = c(
    "HEADACHE", "NAUSEA", "FATIGUE",
    "DIZZINESS", "RASH", "COUGH"
  ),
  AESTDTC_MESSY = c(
    "2024-01-20",   # year-month-day with dashes
    "25/01/2024",   # day/month/year
    "01/18/2024",   # month/day/year
    "20240122",     # year-month-day without separators
    "2024/01/25",   # year/month/day with slashes
    "Jan 26, 2024"  # abbreviated month name
  ),
  RFSTDTC = rep("2024-01-15", times = 6)
)

# YOUR TASK: Clean and parse the dates
#   1. AESTDT - parse AESTDTC_MESSY (hint: case_when() with a different
#               lubridate parser per pattern)
#   2. RFSTDT - parse RFSTDTC
#   3. AESTDY - study day, using the formula: AESTDT - RFSTDT + 1
#
# Until the placeholders below are replaced, all three derived columns are NA.
ae_dates <- ae_raw %>%
  mutate(
    # YOUR CODE HERE - parse AESTDTC_MESSY.
    # Add one branch per date layout above the catch-all.
    # Hint: use str_detect() to identify patterns, then ymd(), dmy(), mdy() etc.
    AESTDT = case_when(TRUE ~ as.Date(NA)),

    # YOUR CODE HERE - parse RFSTDTC
    RFSTDT = as.Date(NA),

    # YOUR CODE HERE - calculate study day
    AESTDY = NA_real_
  )

# Display results
print("Parsed dates and study days:")
print(ae_dates)

# ===========================
# EXERCISE 2: Study Day Categories
# ===========================

# Extend the Exercise 1 result with a study-day period label and an early-AE
# flag. Both columns are placeholders until the exercise is completed.
ae_with_categories <- ae_dates %>%
  mutate(
    # YOUR CODE HERE - create study day categories.
    # Replace the catch-all with one branch per row of this mapping:
    #   AESTDY <= 0     should be "Pre-treatment"
    #   AESTDY 1-7      should be "Week 1"
    #   AESTDY 8-14     should be "Week 2"
    #   AESTDY 15-28    should be "Month 1"
    #   AESTDY > 28     should be "After Month 1"
    #   AESTDY missing  should be "Unknown"
    STUDYDAY_PERIOD = case_when(TRUE ~ "Unknown"),

    # YOUR CODE HERE - create early AE flag (within first 7 days)
    EARLY_AE = "N"
  )

# Display results
print("With study day categories:")
print(ae_with_categories)

# ===========================
# EXERCISE 3: String Cleaning Challenge
# ===========================

# Create messy adverse event terms: free-text AE descriptions with inconsistent
# whitespace and casing and with the severity embedded in the text, plus the
# medication (name and strength) recorded for each subject.
ae_messy_text <- tibble(
  USUBJID = c("001-001", "001-002", "001-003", "001-004", "001-005"),
  AEDECOD_RAW = c(
    "  mild headache  ",
    "SEVERE nausea (grade 3)",
    "fatigue - moderate",
    "  DIZZINESS mild  ",
    "skin rash (MODERATE)"
  ),
  MEDICATION = c(
    "Ibuprofen 400mg",
    "ondansetron 8 MG",
    "caffeine 200mg",
    "rest",
    "hydrocortisone 1%"
  )
)

# YOUR TASK: Clean and extract information
ae_cleaned <- ae_messy_text %>%
  mutate(
    # YOUR CODE HERE - clean adverse event terms.
    # Only the trim is wired in so far; insert the remaining steps into the pipe.
    AEDECOD_CLEAN = AEDECOD_RAW %>%
      # Step 1: Remove leading/trailing spaces
      # Step 2: Convert to uppercase
      # Step 3: Remove parentheses and contents
      # Step 4: Remove dashes and extra spaces
      # Step 5: Final trim
      str_trim(),

    # YOUR CODE HERE - extract severity
    SEVERITY = case_when(
      # Use str_detect with (?i) for case-insensitive matching
      # Check for "mild", "moderate", "severe" in AEDECOD_RAW
      TRUE ~ "UNKNOWN"
    ),

    # YOUR CODE HERE - extract base term (remove severity words).
    # AEDECOD_CLEAN here is the column created earlier in this mutate() call.
    AETERM_BASE = AEDECOD_CLEAN %>%
      # Remove severity words from beginning and end
      str_to_upper(),

    # YOUR CODE HERE - create specific AE flags
    HEADACHE_FLAG = "N",
    NAUSEA_FLAG = "N",
    FATIGUE_FLAG = "N",

    # YOUR CODE HERE - extract numeric dose from medication.
    # First number in the string, including an optional decimal part so that a
    # strength such as "2.5mg" is not truncated to 2. Entries with no digits
    # (e.g. "rest") yield NA.
    DOSE_NUMERIC = as.numeric(str_extract(MEDICATION, "\\d+(?:\\.\\d+)?")),

    # YOUR CODE HERE - clean medication names
    MED_CLEAN = MEDICATION %>%
      # Remove dose information and clean
      str_to_upper()
  )

# Display results
print("Cleaned text data:")
print(ae_cleaned)

# ===========================
# EXERCISE 4: Regular Expressions Practice (R4DS Chapter 15)
# ===========================

# Create clinical data with patterns to validate. Each column mixes
# well-formed and malformed values so every rule below has both kinds of input.
clinical_data <- tibble(
  USUBJID = c("001-001", "002-001", "001-ABC", "999-123", "001-99"),
  PHONE = c(
    "(555) 123-4567", "555-123-4567", "555.123.4567",
    "5551234567", "invalid-phone"
  ),
  EMAIL = c(
    "investigator@clinic.com", "bad.email", "test@site.org",
    "missing@", "@incomplete.com"
  ),
  LAB_RESULT = c(
    "WBC: 7.2 K/uL", "Hemoglobin: 12.5 g/dL", "Glucose: 95 mg/dL",
    "Invalid result", ""
  ),
  VISIT_DATE = c(
    "2024-01-15", "2024/01/20", "15-Jan-2024",
    "invalid-date", "2024-13-45"
  )
)

# YOUR TASK: Use regular expressions for validation
# NOTE: each literal "pattern" below is a placeholder. None of the sample
# values contains that text, so the matching VALID_* column is FALSE for every
# row until a real regex is supplied.
clinical_validated <- clinical_data %>%
  mutate(
    # YOUR CODE HERE - validate subject ID format (###-###)
    VALID_SUBJID = str_detect(USUBJID, "pattern"), # Add your regex pattern

    # YOUR CODE HERE - validate phone number formats
    VALID_PHONE = str_detect(PHONE, "pattern"), # Add regex for phone validation

    # YOUR CODE HERE - validate email addresses
    VALID_EMAIL = str_detect(EMAIL, "pattern"), # Add regex for email validation

    # Extract the numeric value from the lab result: the first run of digits
    # with an optional decimal part, so "7.2" and "12.5" are kept whole rather
    # than truncated to 7 and 12. Rows without digits become NA.
    LAB_VALUE = as.numeric(str_extract(LAB_RESULT, "\\d+(?:\\.\\d+)?")),

    # Extract the lab test name: everything before the first colon. When there
    # is no colon the whole string is returned; an empty string gives NA.
    LAB_TEST = str_extract(LAB_RESULT, "^[^:]+"),

    # YOUR CODE HERE - validate date formats (YYYY-MM-DD only)
    VALID_DATE_FORMAT = str_detect(VISIT_DATE, "pattern"), # Add regex pattern

    # YOUR CODE HERE - extract year from valid dates.
    # As written this takes the first four consecutive digits of every value,
    # whether or not the date is valid; tighten the pattern as part of the task.
    VISIT_YEAR = str_extract(VISIT_DATE, "\\d{4}") # Add regex pattern
  )

print("Regular expression validation results:")
print(clinical_validated)

# ===========================
# EXERCISE 5: Factor Management Practice (R4DS Chapter 16)
# ===========================

# Eight subjects ("001-001" through "008-008") whose categorical attributes are
# still plain character vectors; they are converted to factors below.
clinical_factors <- tibble(
  USUBJID = sprintf("%03d-%03d", 1:8, 1:8),
  SEVERITY = c(
    "Mild", "Severe", "Moderate", "Mild",
    "Severe", "Moderate", "Mild", "Moderate"
  ),
  TREATMENT = c(
    "Placebo", "Low Dose", "High Dose", "Placebo",
    "Low Dose", "High Dose", "Placebo", "Low Dose"
  ),
  OUTCOME = c(
    "Recovered", "Ongoing", "Recovered", "Worsened",
    "Recovered", "Ongoing", "Recovered", "Ongoing"
  ),
  VISIT_NAME = c(
    "Screening", "Day 1", "Week 2", "Month 1",
    "Week 4", "End of Study", "Follow-up", "Unscheduled"
  )
)

# YOUR TASK: Convert to appropriate factors
clinical_factors_clean <- clinical_factors %>%
  mutate(
    # YOUR CODE HERE - ordered severity factor (fill in the correct order);
    # levels run from least to most severe.
    SEVERITY_FACTOR = factor(
      SEVERITY,
      levels = c("Mild", "Moderate", "Severe"),
      ordered = TRUE
    ),

    # YOUR CODE HERE - treatment factor with meaningful labels (fill in levels
    # and labels); the labels currently repeat the levels unchanged.
    TREATMENT_FACTOR = factor(
      TREATMENT,
      levels = c("Placebo", "Low Dose", "High Dose"),
      labels = c("Placebo", "Low Dose", "High Dose")
    ),

    # YOUR CODE HERE - outcome factor (add levels if needed); without explicit
    # levels factor() sorts the distinct values.
    OUTCOME_FACTOR = factor(OUTCOME),

    # YOUR CODE HERE - visit factor with logical ordering (fill in the
    # chronological order).
    VISIT_FACTOR = factor(
      VISIT_NAME,
      levels = c(
        "Screening", "Day 1", "Week 2", "Week 4",
        "Month 1", "End of Study", "Follow-up", "Unscheduled"
      )
    ),

    # YOUR CODE HERE - risk category based on severity and treatment.
    # Add logic combining severity and treatment above the catch-all:
    #   High risk:   Severe + High Dose
    #   Medium risk: Moderate + any dose, or Severe + Low Dose/Placebo
    #   Low risk:    Mild + any dose
    RISK_CATEGORY = factor(
      case_when(TRUE ~ "Medium"),
      levels = c("Low", "Medium", "High"),
      ordered = TRUE
    )
  )

print("Factor management results:")
print(clinical_factors_clean)

# Factor manipulation exercises:

# YOUR CODE HERE - count levels (fill in the factor)
severity_counts <- table(clinical_factors_clean[["SEVERITY_FACTOR"]])

# YOUR CODE HERE - reorder treatment by frequency (fill in the factor)
treatment_reordered <- fct_infreq(clinical_factors_clean[["TREATMENT_FACTOR"]])

# YOUR CODE HERE - collapse rare visit types (fill in early, late, and other
# visits)
visit_collapsed <- fct_collapse(
  clinical_factors_clean[["VISIT_FACTOR"]],
  Early = c("Screening", "Day 1"),
  Late = c("End of Study", "Follow-up"),
  Other = c("Week 2", "Week 4", "Month 1", "Unscheduled")
)

print("Factor manipulation results:")
print(severity_counts)
print(levels(treatment_reordered))
print(levels(visit_collapsed))

# ===========================
# EXERCISE 6: AESTDY Derivation Practice
# ===========================

# A more complex dataset for AESTDY practice: several events per subject,
# date-time strings with a "T" separator, and one record (001-003) whose start
# and end date-times are both missing.
complex_ae <- tibble(
  USUBJID = c("001-001", "001-001", "001-002", "001-002", "001-003"),
  AESEQ = c(1, 2, 1, 2, 1),
  AEDECOD = c("HEADACHE", "NAUSEA", "FATIGUE", "HEADACHE", "DIZZINESS"),
  AESTDTC = c(
    "2024-01-20T08:30",
    "2024-01-25T14:15",
    "2024-01-18T09:00",
    "2024-01-22T16:30",
    NA
  ),
  AEENDTC = c(
    "2024-01-21T10:00",
    "2024-01-26T08:00",
    "2024-01-20T18:00",
    "2024-01-23T12:00",
    NA
  ),
  RFSTDTC = c(
    "2024-01-15T09:00",
    "2024-01-15T09:00",
    "2024-01-16T10:00",
    "2024-01-16T10:00",
    "2024-01-15T09:00"
  )
)

# YOUR TASK: Derive comprehensive study day variables.
# Every derived column below is a placeholder (NA, or a constant label) until
# the exercise is completed.
complex_ae_derived <- complex_ae %>%
  mutate(
    # YOUR CODE HERE - parse start dates/times
    AESTDT = as.Date(NA),
    AEENDT = as.Date(NA),
    RFSTDT = as.Date(NA),

    # YOUR CODE HERE - calculate study days
    AESTDY = NA_real_,
    AEENDY = NA_real_,

    # YOUR CODE HERE - calculate duration
    AE_DURATION_DAYS = NA_real_,

    # YOUR CODE HERE - handle missing dates.
    # Add logic above the catch-all to handle missing dates appropriately.
    AESTDY_SAFE = case_when(TRUE ~ NA_real_),

    # YOUR CODE HERE - create validation flags.
    # Add checks for data quality issues above the catch-all.
    VALID_DATES = case_when(TRUE ~ "Complete")
  )

# Display results
print("Complex AESTDY derivations:")
print(complex_ae_derived)

# ===========================
# EXERCISE 7: GitHub Copilot in RStudio Practice
# ===========================

# Try writing these comments and let Copilot help in RStudio:

# Convert datetime string to date only


# Flag weekend adverse events


# Calculate time between AE start and end in hours


# Extract the first word from AEDECOD using regex


# Create ordered factor for dose escalation levels


# Validate clinical trial site ID format with regex


# ===========================
# BONUS: Combined Date, Text, Regex, and Factors Challenge
# ===========================

# Combine everything you've learned from R4DS Chapters 14, 15, 16.
# Builds on the Exercise 6 result; every new column is a constant placeholder
# until the challenge is completed.
final_challenge <- complex_ae_derived %>%
  mutate(
    # YOUR CODE HERE - create comprehensive AE description with factors.
    # Pass the pieces to combine (AEDECOD, study day, duration) to paste0().
    AE_DESCRIPTION = paste0("AE Description"),

    # YOUR CODE HERE - create analysis-ready flags
    ONGOING_AE = "N",
    EARLY_ONSET = "N",
    LONG_DURATION = "N"
  )

# ===========================
# SUMMARY STATISTICS
# ===========================

# Summary statistics over the results of Exercises 2 and 3.
cat(
  "\n=== EXERCISE SUMMARY ===\n",
  "AEs by study day period:\n",
  sep = ""
)
# useNA = "ifany" adds an NA row only when missing values are present.
print(table(ae_with_categories[["STUDYDAY_PERIOD"]], useNA = "ifany"))

cat("\nAEs by severity:\n")
print(table(ae_cleaned[["SEVERITY"]], useNA = "ifany"))

cat("\nEarly AEs (≤7 days):\n")
# %in% never returns NA, so missing flags simply do not count as "Y".
print(sum(ae_with_categories[["EARLY_AE"]] %in% "Y"))

# ===========================
# EXERCISE COMPLETE!
# ===========================

# Closing banner: one cat() call with sep = "" so the pieces are written
# back-to-back exactly as given (each piece carries its own newline).
cat(
  "\n🎉 Module 4 Exercise Complete!\n",
  "You practiced R4DS concepts:\n",
  "- Parsing different date formats with lubridate\n",
  "- Calculating study days (AESTDY)\n",
  "- String manipulation with stringr (R4DS Ch. 14)\n",
  "- Regular expressions for data validation (R4DS Ch. 15)\n",
  "- Factor management with forcats (R4DS Ch. 16)\n",
  "- Clinical pattern matching and validation\n",
  "- Ordered factors for severity and risk assessment\n",
  "- Combined dates, strings, regex, and factors\n",
  "\nExcellent work! Ready for Module 5: Functions & Macro Translation!\n",
  sep = ""
)
